Customisable Prediction Accuracy Plots¶

In this notebook, any model that adheres to the scikit-learn estimator API can be fitted across customisable rolling training windows, with predictions made and plotted for the corresponding test windows. Fitted models are persisted to disk so that repeat runs are fast.

In [ ]:
import os
from joblib import dump, load
import numpy as np
import pandas as pd
import seaborn as sns
# BUG FIX: was `import matplotlib as plt` — that binds the top-level package,
# not the plotting API. The pyplot interface lives in matplotlib.pyplot.
import matplotlib.pyplot as plt
from tscv import GapRollForward
from tqdm.notebook import tqdm

# --- notebook parameters: import, choose model, set hyperparameters
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, 
    AdaBoostRegressor, HistGradientBoostingRegressor)
from sklearn.neighbors import KNeighborsRegressor

# specify load and weather data details
DATA_PATH = '../data'
REGIONS = ['nsw', 'sa']
DATA_FILENAME = 'merged.csv'
OBS_PER_DAY = 24                # observation frequency of the merged dataset
X_EXCLUDE = ['datetime', 'pv_est', 'net_load', 'total_load']  # columns never used as features

HOLIDAY_FILENAME = 'australian-public-holidays-combined-2021-2024.csv'

# for convenience below
obs_year, obs_week = OBS_PER_DAY * 365, OBS_PER_DAY * 7

# specify methodology and model parameters
TRAIN_BEGIN = '2020-01-01'
TRAIN_MIN_SIZE = obs_year   # change for expanding window
TRAIN_MAX_SIZE = obs_year   # change for expanding window (np.inf)
TEST_MIN_SIZE = obs_week
TEST_MAX_SIZE = obs_week
TEST_FINAL_N = None         # set to None for rolling test window, or n to test final observations
ROLL_SIZE = obs_week

MODEL_SELECTION = 'hgb'     # key into MODELS_DEFINITION below

# Each entry: 'class' (sklearn-API regressor) and optional 'kwargs' passed to its constructor.
MODELS_DEFINITION = {
    'rf': {
        'class': RandomForestRegressor,
        'kwargs': {'n_jobs': 8}
    },
    'hgb': {
        'class': HistGradientBoostingRegressor,
        'kwargs': {} # capable of quantile loss, l2reg
    },
    'gb': {'class': GradientBoostingRegressor},
    'ada': {'class': AdaBoostRegressor},
    'knn': {'class': KNeighborsRegressor}
}
# --- end notebook parameters
In [ ]:
# prep model and persistence
MODEL = MODELS_DEFINITION[MODEL_SELECTION]['class']
MODEL_TYPE = str(MODEL).split("'")[1].split('.')[-1] # only for plot display
MODEL_KWARGS = MODELS_DEFINITION[MODEL_SELECTION].get('kwargs', {})
MODEL_PERSISTENCE_DIRS = {R: os.path.join('../models', R, MODEL_SELECTION) for R in REGIONS}
# renamed loop variable: `dir` shadowed the builtin of the same name
for model_dir in MODEL_PERSISTENCE_DIRS.values():
    os.makedirs(model_dir, exist_ok=True)

# extract holidays from file
holiday_df = pd.read_csv(os.path.join(DATA_PATH, HOLIDAY_FILENAME), dtype='str')
holiday_df['Date'] = pd.to_datetime(holiday_df['Date'])

# import and preprocess data: one DataFrame per region, keyed by region code
dfs = {}
for R in REGIONS:
    full_data_path = os.path.join(DATA_PATH, R, DATA_FILENAME)
    # the previous os.path.relpath() round-trip was a no-op; read the path directly
    df = pd.read_csv(full_data_path)
    # .astype('datetime64') without a unit raises TypeError on pandas >= 2.0;
    # pd.to_datetime handles the conversion portably
    df['datetime'] = pd.to_datetime(df['datetime'])
    # derive calendar features for the model
    dt = df['datetime'].dt
    df['year'] = dt.year
    df['month'] = dt.month
    df['day'] = dt.day
    df['hour'] = dt.hour
    df['minute'] = dt.minute
    df['day_of_week'] = dt.day_of_week
    df['week'] = dt.isocalendar().week

    # boolean holiday flag: observation's calendar day is a public holiday
    # in this region's jurisdiction
    holidays = holiday_df.loc[holiday_df['Jurisdiction'] == R, ['Date', 'Holiday Name']]
    df['holiday'] = dt.floor('D').isin(holidays['Date'])
    dfs[R] = df
In [ ]:
# Rolling train/predict loop: for each region, walk GapRollForward's
# train/test windows, fit (or reload) one model per window, and collect
# per-observation predictions for later plotting.
prdfs = []  # one prediction DataFrame per (region, window)
for R in REGIONS:
    persistence_dir = MODEL_PERSISTENCE_DIRS[R]
    df = dfs[R]

    # compute X and y column indices
    # setdiff1d returns names sorted; we convert back to sorted positional
    # indices for iloc-based slicing below
    X_cols = np.setdiff1d(df.columns.values, X_EXCLUDE)
    X_inds = sorted(df.columns.get_indexer_for(X_cols))
    y_ind = df.columns.get_loc('net_load')

    # create train/test window strategy
    df_subset = df[df.datetime >= TRAIN_BEGIN]
    tscv = GapRollForward(min_train_size=TRAIN_MIN_SIZE, max_train_size=TRAIN_MAX_SIZE,
                        min_test_size=TEST_MIN_SIZE, max_test_size=TEST_MAX_SIZE,
                        roll_size=ROLL_SIZE)
    # split() is re-invoked below; this pass only counts the windows
    print(sum(1 for i in tscv.split(df_subset)), f'{R} models to be loaded/trained')

    # execute train/test window strategy
    for i, (train_ind, test_ind) in tqdm(enumerate(tscv.split(df_subset))):
        if TEST_FINAL_N:
            # override: always score the last TEST_FINAL_N observations
            # (negative positions index from the end via iloc)
            test_ind = range(-TEST_FINAL_N, 0)

        # positional slicing keeps X and y row-aligned for each window
        X_train, X_test = df_subset.iloc[train_ind, X_inds], df_subset.iloc[test_ind, X_inds]
        y_train, y_test = df_subset.iloc[train_ind, y_ind], df_subset.iloc[test_ind, y_ind]

        # train or load model
        # cache key encodes the train span, observation frequency, model
        # kwargs and the feature-column indices, so any change to those
        # forces a refit; assumes column 0 is 'datetime' — holds for the
        # merged.csv layout loaded above
        begin, end = df_subset.iloc[[train_ind[0], train_ind[-1]], 0].dt.date
        argstring = '_'.join([f'{k}={v}' for k, v in MODEL_KWARGS.items()])
        x_ind_string = '-'.join([str(x) for x in X_inds])
        model_filename = f'{begin}_{OBS_PER_DAY}_{end}_{argstring}_{x_ind_string}.joblib'
        model_filepath = os.path.join(persistence_dir, model_filename)
        try:
            model = load(model_filepath)
        except FileNotFoundError:
            # cache miss: fit from scratch and persist for future runs
            model = MODEL(**MODEL_KWARGS)
            model.fit(X_train, y_train)
            dump(model, model_filepath)

        # predict
        prd = model.predict(X_test)
        prdf = pd.DataFrame({'datetime': df_subset.iloc[test_ind, 0],
                            'region': R,
                            'model': i,
                            'train_end': end,
                            'predicted': prd,
                            'net_load': y_test})
        prdfs.append(prdf)

# concatenate predictions and compute discrete error metrics
predictions = pd.concat(prdfs)
predictions['Residual'] = predictions['predicted'] - predictions['net_load']
predictions['Absolute Error'] = predictions['Residual'].abs()
# NOTE(review): Percent Error divides by net_load directly — rows where
# net_load is 0 yield inf; confirm that cannot occur in this data
predictions['Percent Error'] = predictions['Residual'] / predictions['net_load']
predictions['Absolute Percent Error'] = predictions['Percent Error'].abs()
predictions['Squared Error'] = predictions['Residual'] ** 2
# describe() gives count/mean/std/min/quartiles/max per metric column,
# grouped by region and training-window end date
prediction_summary = predictions.groupby(['region', 'train_end']).describe()
131 nsw models to be loaded/trained
0it [00:00, ?it/s]
113 sa models to be loaded/trained
0it [00:00, ?it/s]
In [ ]:
# Plot the per-window error-metric distributions as faceted line charts
# (one row per metric, one column per region).
import plotly.express as px
import plotly.offline
plotly.offline.init_notebook_mode()

# massage prediction summary into long-form with desired quartiles
# stack([0, 1]) moves the (metric, statistic) column levels into the row
# index; the 4-key .loc then selects the five metrics and the five order
# statistics of interest ('quartile' here loosely includes min and max)
preds = prediction_summary.stack([0, 1]).loc[:, :, 
    ['Residual', 'Absolute Error', 'Percent Error', 'Absolute Percent Error', 'Squared Error'],
    ['max', '75%', '50%', '25%', 'min']
    ].reset_index(name='value').rename(
        {'level_2': 'metric', 'level_3': 'quartile'}, axis=1)

# compute aggregate metrics
regional_preds = predictions.groupby('region')
mapes = regional_preds['Absolute Percent Error'].mean()
maes = regional_preds['Absolute Error'].mean()
rmses = regional_preds['Squared Error'].mean() ** 0.5

# for pretty aggregate metric display in subplot titles
def subfig_title(a):
    """Rewrite a facet annotation in place: strip the 'facet=' prefix, and
    for region-column titles append that region's MAPE/MAE/RMSE."""
    facet_name = a.text.split("=")[-1]
    if facet_name not in REGIONS:
        # metric-row annotation: just drop the 'metric=' prefix
        a.update(text=facet_name)
        return
    # region-column annotation: look up the aggregate metrics by region code
    mape, mae, rmse = (x[facet_name] for x in [mapes, maes, rmses])
    a.update(text=f'{facet_name} MAPE={mape:.2%} MAE={mae:.1f} RMSE={rmse:.1f}')

fig = px.line(preds, x='train_end', y='value', color='quartile', facet_col='region',
    facet_row='metric', title=MODEL_TYPE, height=800, template='ggplot2',
    facet_col_spacing=0.08)
# metrics have different scales, so unlink the y-axes and show ticks on each
fig.update_yaxes(matches=None, title='', showticklabels=True)
fig.for_each_annotation(subfig_title)
fig.show()